
//-----------------------------------------------------------------------
//
// Gentrification and pioneer businesses
// Kristian Behrens, Brahim Boualam, Julien Martin, and Florian Mayneris
//
// First version: 17/02/2015
// This  version: 17/11/2021
//
// Preparation of the NHGIS census variables for New York
//
//-----------------------------------------------------------------------


// Do-file for concording geographical data, and for apportioning data
// using Census Blocks (either with population weights or proportional to the
// surface of the blocks)


// THIS IS FOR THE HOUSING VARIABLES, BROKEN DOWN FROM BLOCK GROUP TO BLOCK


cd "/Users/kristianbehrens/Desktop/GENTRI_RESTAT_FINAL/"

clear
clear matrix

set more off


// This is the list of counties for the NY MSA (we use the 2010 definition)
#delimit ; 

local listcountyNYMSA
34003	36005	36027	34013	34017	34019	36047	34023	34025	34027
36059	36061	34029	36071	34031	42103	36079	36081	36085	36087
34035	36103	34037	34039	36119;

local listvar
001 002 003 004 005;

local years
90 00 10;

#delimit cr 


// -------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 1990 BLOCKGROUP DATA

// STEP 1

insheet using "data/census/nhgis_housing_bgroup/nhgis0013_ds120_1990_blck_grp.csv", delimit(",")

gen temp1 = ""
gen temp2 = ""
gen temp3 = ""
gen temp4 = ""
gen geoid90 = ""

gen test = substr(gisjoin, 2, 100)

if (strlen(gisjoin) == 13) {
	replace temp1 = substr(test, 1, 2)
	replace temp2 = substr(test, 4, 3)
	replace temp3 = substr(test, 8, 4)
	replace temp4 = substr(test, 12, 1)
	replace geoid90 = temp1 + temp2 + temp3 + temp4
}

if (strlen(gisjoin) == 15) {
	replace temp1 = substr(test, 1, 2)
	replace temp2 = substr(test, 4, 3)
	replace temp3 = substr(test, 8, 6)
	replace temp4 = substr(test, 14, 1)
	replace geoid90 = temp1 + temp2 + temp3 + temp4
}

drop temp* test

sort geoid90
drop if geoid90[_n-1] == geoid90[_n]

keep geoid90 est001 es6001
ren est001 median_value90
ren es6001 median_crent90

//Keep only NYMSA counties
gen ctid = substr(geoid90, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid90 median_value90 median_crent90
drop ctid temp
save "temp/temp_variables.dta", replace


clear

insheet using "data/census/nhgis_housing_bgroup/nhgis0013_ds123_1990_blck_grp.csv", delimit(",")

gen temp1 = ""
gen temp2 = ""
gen temp3 = ""
gen temp4 = ""
gen geoid90 = ""

gen test = substr(gisjoin, 2, 100)

if (strlen(gisjoin) == 13) {
	replace temp1 = substr(test, 1, 2)
	replace temp2 = substr(test, 4, 3)
	replace temp3 = substr(test, 8, 4)
	replace temp4 = substr(test, 12, 1)
	replace geoid90 = temp1 + temp2 + temp3 + temp4
}

if (strlen(gisjoin) == 15) {
	replace temp1 = substr(test, 1, 2)
	replace temp2 = substr(test, 4, 3)
	replace temp3 = substr(test, 8, 6)
	replace temp4 = substr(test, 14, 1)
	replace geoid90 = temp1 + temp2 + temp3 + temp4
}

drop temp* test

sort geoid90
drop if geoid90[_n-1] == geoid90[_n]

keep geoid90 ex8001 eyu001
ren ex8001 median_yearb90
ren eyu001 median_grent90

// Keep only NYMSA counties
gen ctid = substr(geoid90, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid90 median_yearb90 median_grent90
drop ctid temp
save "temp/temp_variables2.dta", replace

merge 1:1 geoid90 using "temp/temp_variables.dta"

drop if _merge != 3
drop _merge

save "temp/temp_variables.dta", replace



// STEP 2


// Merge with the 1990 geography files
use "data/geography/geog_new_york/geography1990.dta", clear


// Required for 1990 data, takes care of the block suffix
gen full_bg_id = ""
replace full_bg_id = substr(geoid90, 1, strlen(geoid90) - 2) if blocksuffix90 == ""
replace full_bg_id = substr(geoid90, 1, strlen(geoid90) - 3) if blocksuffix90 != ""

gen ctid = substr(geoid90, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1
drop temp ctid

ren geoid90 geoid90_back
ren full_bg_id geoid90

sort geoid90

merge m:1 geoid90 using "temp/temp_variables.dta"

tab _merge
drop if _merge == 2
drop _merge

ren geoid90 geoid90_blgroup
ren geoid90_back geoid90


gen year = 1990
keep year geoid90 median_crent90 median_value90 median_yearb90 median_grent90

ren geoid90 geoid
ren median_crent90 median_crent_block
ren median_value90 median_value_block
ren median_yearb90 median_yearb_block
ren median_grent90 median_grent_block

order geoid year median_yearb_block median_crent_block median_grent_block median_value_block

save "temp/housing1990.dta", replace


// END: MATCHING OF NHGIS 1990 BLOCKGROUP DATA
// -------------------------------------------------------



clear


// -------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 2000 BLOCKGROUP DATA


// STEP 1: GET THE BLOCKGROUP DATA

insheet using "data/census/nhgis_housing_bgroup/nhgis0013_ds152_2000_blck_grp.csv", delimit(",")

gen test = substr(gisjoin, 2, 100)
gen temp1 = substr(test, 1, 2)
gen temp2 = substr(test, 4, 3)
gen temp3 = substr(test, 8, 6)
gen temp4 = substr(test, 14, 1)
gen geoid00 = temp1 + temp2 + temp3 + temp4


drop test temp*
keep geoid00 g68001 g74001 g8c001 g8v001
ren g68001 median_yearb00
ren g74001 median_crent00
ren g8c001 median_grent00
ren g8v001 median_value00

// Keep only NYMSA counties
gen ctid = substr(geoid00, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid00 median_yearb00 median_crent00 median_grent00 median_value00
drop ctid temp
save "temp/temp_variables.dta", replace


// STEP 2: MERGE WITH THE BLOCK GEOGRAPHY FILES

use "data/geography/geog_new_york/geography2000.dta", clear

gen ctid = substr(geoid00, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1
drop temp ctid

ren geoid00 geoid00_back
gen geoid00 = substr(geoid00_back, 1, 12)

sort geoid00

merge m:1 geoid00 using "temp/temp_variables.dta"

tab _merge
drop _merge

ren geoid00 geoid00_blgroup
ren geoid00_back geoid00

gen year = 2000
keep year geoid00 median_yearb00 median_crent00 median_grent00 median_value00

ren geoid00 geoid
ren median_crent00 median_crent_block
ren median_value00 median_value_block
ren median_yearb00 median_yearb_block
ren median_grent00 median_grent_block

order geoid year median_yearb_block median_crent_block median_grent_block median_value_block

save "temp/housing2000.dta", replace


// END: MATCHING OF NHGIS 2000 BLOCKGROUP DATA
// -------------------------------------------------------



clear



// -------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 2010 BLOCKGROUP DATA

// STEP 1

insheet using "data/census/nhgis_housing_bgroup/nhgis0013_ds176_20105_2010_blck_grp.csv", delimit(",")

gen test = substr(gisjoin, 2, 100)
gen temp1 = substr(test, 1, 2)
gen temp2 = substr(test, 4, 3)
gen temp3 = substr(test, 8, 6)
gen temp4 = substr(test, 14, 1)
gen geoid10 = temp1 + temp2 + temp3 + temp4


drop test temp*
keep geoid10 jsee001 jsze001 js5e001 jtie001
ren jsee001 median_yearb10
ren jsze001 median_crent10
ren js5e001 median_grent10
ren jtie001 median_value10

// Keep only NYMSA counties
gen ctid = substr(geoid10, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid10 median_yearb10 median_crent10 median_grent10 median_value10
drop ctid temp
save "temp/temp_variables.dta", replace


// STEP 2: Merge with the 2010 geography files

use "data/geography/geog_new_york/geography2010.dta", clear

gen ctid = substr(geoid10, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1
drop temp ctid

ren geoid10 geoid10_back
gen geoid10 = substr(geoid10_back, 1, 12)

sort geoid10

merge m:1 geoid10 using "temp/temp_variables.dta"

tab _merge
drop _merge

ren geoid10 geoid10_blgroup
ren geoid10_back geoid10


gen year = 2010
keep year geoid10 median_yearb10 median_crent10 median_grent10 median_value10

ren geoid10 geoid
ren median_yearb10 median_yearb_block
ren median_crent10 median_crent_block
ren median_grent10 median_grent_block 
ren median_value10 median_value_block

order geoid year median_yearb_block median_crent_block median_grent_block median_value_block

save "temp/housing2010.dta", replace


// MATCHING OF NHGIS 2010 BLOCKGROUP DATA
// -------------------------------------------------------


// Assemble files

append using "temp/housing2000.dta" "temp/housing1990.dta"

sort year geoid

save "results/NewYork/census_variables_housing.dta", replace


// Adjust the geoid codes for 1990
// The 1990 block data comes from the NHGIS, with NHGIS identifiers
// The concordance data comes from the Census with the Census identifiers

// We need to transform the NHGIS identifiers (in the geography1990.dta
// files to Census identifiers)

gen test1 = substr(geoid, 1, 9)
gen test2 = substr(geoid, 10, 3) if strlen(geoid) == 12
gen test3 = test1 + "00" + test2
replace  geoid = test3 if strlen(geoid) == 12

drop test*

gen test1 = substr(geoid, 1, 9)
gen test2 = substr(geoid, 10, 4) if strlen(geoid) == 13
gen test3 = test1 + "00" + test2
replace  geoid = test3 if strlen(geoid) == 13

drop test*

save "results/NewYork/census_variables_housing.dta", replace

